library(yaml)
library(foreign)
library(plyr)
library(dplyr)
library(readr)
library(tidyverse)
library(data.table)
library(readstata13)

### Load Path
# Parse the project-wide configuration and pull out the two directories this
# script uses: the raw driver inputs and the descriptive build output folder.
CONFIG       <- yaml.load_file("config_global.yaml")
driver_path  <- CONFIG$source$raw$drivers
build_path   <- CONFIG$build$descriptive

### Load Lib Functions
# Source the shared helper file from the configured lib directory.
# NOTE(review): presumably this is where load_worldbank_data() and
# write_gslab_table() come from (both are called in main() but not defined in
# this file) -- confirm.
source(sprintf("%s/library.r", CONFIG$source$lib))

### Load Constants
# Three parallel vectors describing the same 12 countries, in the same order:
#   countries      - display names, used as the `country` key in every output
#   country_codes  - three-letter codes, matched against LOCATION in the
#                    immigration files
#   country_polar  - lower-case names, matched against the `country` column of
#                    the polarization data and the prepare_data file stems
countries          <- c("United States", "Canada", "United Kingdom","Germany", "Australia", "New Zealand",
                        "Norway", "Sweden", "Switzerland", "Denmark", "France", "Japan")
country_codes      <- c("USA", "CAN", "GBR", "DEU", "AUS", "NZL", "NOR", "SWE", "CHE", "DNK", "FRA", "JPN")
country_polar      <- c("united_states", "canada", "britain", "germany", "australia", "new_zealand",
                        "norway", "sweden", "switzerland", "denmark", "france", "japan")

# Lookup table between the three naming schemes. Built with data.frame() and
# stringsAsFactors = FALSE so the columns are character on every R version;
# going through cbind() + as.data.frame() produces factors on R < 4.0, and
# assigning a factor element into a character vector stores its integer code.
country_map        <- data.frame(country = countries,
                                 code    = country_codes,
                                 polar   = country_polar,
                                 stringsAsFactors = FALSE)

### Main
# Build every candidate "driver" series plus the affective-polarization series
# and save them under build_path.
#
# Reads the globals `countries`, `country_map`, `driver_path` and `build_path`.
# Side effects (all under build_path):
#   - ineq_source_corr.txt : table of which inequality source was used per
#                            country and the correlation between the sources
#   - drivers.rds          : named list with one data frame per series
#   - drivers_<name>.csv   : the same data frames, one CSV per list element
main <- function(){

  # Elite polarization
  elite_data <- get_rehm_reilly(countries)

  # Party-ideology sorting
  sorting_data <- get_sorting_data(countries, country_map)

  # Ethnic Fractionalization
  hief_data <- get_hief_data(countries)

  # Private 24 hours news
  tv_data <- get_24hr_news()

  # Social media/internet news
  social_data <- get_social_news()

  # Internet
  int_data         <- load_worldbank_data("internet", sprintf("%s/internet/data.csv", driver_path), countries)

  # Trade
  trade_data       <- load_worldbank_data("trade", sprintf("%s/trade/data.csv", driver_path), countries)

  # Inequality
  ineq_data1       <- read.csv(sprintf("%s/inequality/data1.csv", driver_path))
  ineq_data2       <- load_worldbank_data("inequality", sprintf("%s/inequality/data2.csv", driver_path), countries, source=TRUE)
  # Combine the two sources once and unpack both results: element 1 is the
  # merged series, element 2 the per-country source/correlation table.
  ineq_result      <- get_ineq_data(ineq_data1, ineq_data2, countries)
  ineq_data        <- ineq_result[[1]]
  ineq_source_corr <- ineq_result[[2]] %>% as.matrix()
  ineq_source_corr %>% write_gslab_table(sprintf("%s/ineq_source_corr.txt", build_path), "<tab:ineq_source_corr>")

  # Immigration
  imm_data_pct     <- read.csv(sprintf("%s/immigration/percentage.csv", driver_path)) %>%
                            filter(!(LOCATION == "DEU" & TIME == 2004)) # Drop odd zero
  imm_data_stock   <- read.csv(sprintf("%s/immigration/stock1.csv", driver_path))
  imm_data_fjn     <- load_worldbank_data("stock", sprintf("%s/immigration/stock2.csv", driver_path), 
                                          c("France", "Japan", "New Zealand"))
  pop_data         <- load_worldbank_data("population",sprintf("%s/immigration/population.csv", driver_path), countries)
  imm_data         <- get_imm_data(imm_data_pct, imm_data_stock, imm_data_fjn, pop_data, country_map, rescale = TRUE)

  # Nonwhite Share (HIEF)
  nws_alt <- get_hief_nonwhite(countries)

  # Polarization
  polar_data_raw   <- read.csv(sprintf("%s/data.csv", build_path))
  polar_data       <- get_polar_data(polar_data_raw, country_map)

  # Save data: every element is reduced to (country, years, <one value column>)
  DATA <- list("Affective polarization"       = polar_data   %>% select(country, years, partisanaffect = partisan_affect_polarization),
               "Inequality (Gini)"            = ineq_data    %>% select(country, years, inequality),
               "Priv. 24-hr TV news (share)"  = tv_data      %>% select(country, years = year, tvmax = max_share),
               "Priv. 24-hr TV news (count)"  = tv_data      %>% select(country, years = year, tvcount = total_outlets),
               "Share getting news online"    = social_data  %>% filter(source == "online (including social media)") %>%
                 select(country, years = year, onlineshare = share),
               "Foreign-born share"           = imm_data     %>% select(country, years, shareforeign = immigration),
               "Non-white share"              = nws_alt      %>% select(country = Country, years = Year, sharenotwhite = share_nonwhite),
               "Trade share of GDP"           = trade_data   %>% select(country, years, trade),
               "Internet penetration"         = int_data     %>% select(country, years, internet),
               "Ethnic fractionalization"     = hief_data    %>% select(country = Country, years = Year, ethnicfractionalization = EFindexrep),
               "Ethnic polarization"          = hief_data    %>% select(country = Country, years = Year, ethnicpolarization = EPindexrep),
               "Partisan-ideological sorting" = sorting_data %>% select(country, years, partisansorting = value),
               "Elite polarization"           = elite_data   %>% select(country, years = year, elitepolarization = value))

  DATA %>% saveRDS(sprintf("%s/drivers.rds", build_path))

  # One CSV per series; the list name is used verbatim in the file name.
  for (driver in names(DATA)) {
    DATA[[driver]] %>% as.data.frame() %>% fwrite(sprintf("%s/drivers_%s.csv", build_path, driver))
  }

}


### Main Functions
# Load the elite-polarization series from the Rehm-Reilly Stata file.
#
# countries: character vector of country names to keep.
# Returns a data frame with columns country, year, value, where value is
# adj_P_expert_1 multiplied by 100 and rows with a missing value are dropped.
get_rehm_reilly <- function(countries){
  raw <- read.dta13("source/raw/drivers/rehmreilly/robin_best_polarization.dta")

  # Recode the two abbreviated labels to the names used in `countries`
  labels <- as.character(raw$ccode)
  labels[labels == "USA"] <- "United States"
  labels[labels == "UK"]  <- "United Kingdom"
  raw$ccode <- labels

  # Keep the requested countries, then the rows that actually have a measure
  in_sample <- raw[raw$ccode %in% countries, ]
  observed  <- in_sample[!is.na(in_sample$adj_P_expert_1), ]

  out <- observed[, c("ccode", "year", "adj_P_expert_1")]
  names(out) <- c("country", "year", "value")

  out$value <- out$value * 100
  # Reduce `year` to its calendar year (presumably stored as a date -- confirm)
  out$year  <- year(out$year)

  out
}


# Partisan-ideological sorting for all countries.
#
# Uses the estimates from the prepared survey files for every country and
# appends the WVS-based estimates for Japan only.
# Returns the stacked data frame (country, years, value).
get_sorting_data <- function(countries, country_map){
  survey_sorting <- get_our_sorting_data(countries, country_map)
  wvs_sorting    <- get_wvs_data(countries)

  # Only the Japanese WVS rows are added to the survey-based series
  wvs_japan <- filter(wvs_sorting, country == "Japan")

  bind_rows(survey_sorting, wvs_japan)
}


# Partisan-ideological sorting by country and year from the prepared survey
# files in build/prepare_data/.
#
# For every source file and every year in it, `ideology` is regressed on party
# dummies (weighted by the file's `weight` column) and the R-squared, scaled to
# 0-100, is recorded. Rows with missing ideology or party are ignored, and
# source-years with no usable rows are skipped.
#
# The "cses" file labels its years "<Country>_<year>". Its France and Japan
# rows and its Switzerland 2003 row are re-assigned to those countries; every
# other cses row is dropped.
#
# countries:   country display names; its length bounds the name mapping below.
# country_map: lookup with columns `polar` (file-stem name) and `country`
#              (display name).
# Returns a data frame with columns country (display name), years (numeric)
# and value (numeric, 0-100); zero rows if nothing could be estimated.
get_our_sorting_data <- function(countries, country_map){
  sources <- c("australia", "britain", "canada", "denmark", "germany", "japan", "france", "new_zealand", 
               "norway", "sweden", "switzerland", "united_states", "cses")

  # Collect one single-row data frame per source-year and bind once at the end
  estimates <- list()
  for (src in sources){
    db <- read.csv(sprintf("build/prepare_data/%s.csv", src))
    db <- db[which(!is.na(db$ideology)), , drop = FALSE]

    years <- unique(db$year)
    for (i in seq_along(years)){
      # which() treats an NA comparison (missing year) as "no match"
      temp <- db[which(db$year == years[i] & !is.na(db$party)), , drop = FALSE]
      if (nrow(temp) == 0){
        next
      }
      r_sq <- summary(lm(ideology ~ factor(party), data = temp, weights = temp$weight))$r.squared
      estimates[[length(estimates) + 1L]] <- data.frame(country = src,
                                                        years   = as.character(years[i]),
                                                        value   = r_sq,
                                                        stringsAsFactors = FALSE)
    }
  }

  if (length(estimates) == 0){
    return(data.frame(country = character(), years = numeric(), value = numeric(),
                      stringsAsFactors = FALSE))
  }
  data <- do.call(rbind, estimates)

  # Replace with CSES and drop
  data$country[grepl("France", data$years)]           <- "france"
  data$country[grepl("Japan", data$years)]            <- "japan"
  data$country[grepl("Switzerland_2003", data$years)] <- "switzerland"
  data$years <- gsub("France_|Japan_|Switzerland_", "", data$years)
  # order() is stable, so sorting by years and then by country yields rows
  # ordered by country and, within country, by years
  data <- data[order(data[, c("years")]), ]
  data <- data[order(data[, c("country")]), ]
  data <- data[data$country != "cses", ]

  data$years <- as.numeric(data$years)
  data$value <- data$value * 100

  # Map file-stem names to display names; as.character() keeps this correct
  # even if country_map holds factor columns
  polar_names   <- as.character(country_map$polar)
  display_names <- as.character(country_map$country)
  for (i in seq_along(countries)){
    data$country[data$country == polar_names[i]] <- display_names[i]
  }

  return(data)
}

# Partisan-ideological sorting by country and year from the World Values
# Survey time-series file.
#
# Keeps respondents from the 12 hard-coded country codes with a positive
# ideology value (E033) and a non-missing party-vote value (E179_WVS7) other
# than 4, then, for each country-year, regresses ideology on party-vote
# dummies weighted by S017 and records the R-squared scaled to 0-100.
#
# countries: accepted for symmetry with the other loaders; not used here.
# Returns a data frame with columns country, years (numeric) and value
# (numeric, 0-100); zero rows if no respondent survives the filters.
get_wvs_data <- function(countries){
  raw <- readRDS("source/externals/wvs/WVS_TimeSeries_R_v1_6.rds")

  wvs <- raw[, c("S017", "E179_WVS7", "E033", "S020", "S003")]
  names(wvs) <- c("weight", "partyvote", "ideology", "year", "country_code")

  # Numeric country code -> display name
  country_codes <- c("United States"  = 840,
                     "Canada"         = 124,
                     "United Kingdom" = 826,
                     "Germany"        = 276, # Check W. Germany which used 280
                     "Australia"      = 36,
                     "New Zealand"    = 554,
                     "Norway"         = 578,
                     "Sweden"         = 752,
                     "Switzerland"    = 756,
                     "Denmark"        = 208,
                     "France"         = 250,
                     "Japan"          = 392)
  # rep() keeps this valid when the file has zero rows
  wvs$country <- rep(NA_character_, nrow(wvs))
  for (name in names(country_codes)){
    wvs$country[wvs$country_code == country_codes[[name]]] <- name
  }

  # which() drops rows where any condition is NA, matching a filter on each
  keep <- !is.na(wvs$country) &
          wvs$ideology > 0 &
          !is.na(wvs$partyvote) &
          wvs$partyvote != 4 # Drops "None" response
  wvs  <- wvs[which(keep), ]

  country_years <- unique(wvs[, c("country", "year")])

  # Collect one single-row data frame per country-year and bind once
  estimates <- list()
  for (i in seq_len(nrow(country_years))){
    country <- country_years$country[i]
    year    <- country_years$year[i]
    temp    <- wvs[which(wvs$country == country & wvs$year == year), ]
    if (nrow(temp) == 0){
      next
    }
    r_sq <- summary(lm(ideology ~ factor(partyvote), data = temp, weights = temp$weight))$r.squared
    estimates[[length(estimates) + 1L]] <- data.frame(country = country,
                                                      years   = as.numeric(year),
                                                      value   = r_sq * 100,
                                                      stringsAsFactors = FALSE)
  }

  if (length(estimates) == 0){
    return(data.frame(country = character(), years = numeric(), value = numeric(),
                      stringsAsFactors = FALSE))
  }

  data <- do.call(rbind, estimates)

  return(data)
}

# Load the replicated HIEF indices for the requested countries.
#
# countries: display names to keep; the dataset's own labels for the United
#            States and Germany are accepted as well and renamed.
# Returns the filtered sheet with two added columns, EFindexrep and
# EPindexrep: EF_index and EP_index times 100, rounded to one decimal.
get_hief_data <- function(countries){
  hief_names <- c(countries, "United States of America", "German Federal Republic")

  sheet <- readxl::read_xlsx("source/raw/drivers/hief/replication/HIEF_dataset-master/results/HIEF_dataset_v2.xlsx")

  kept <- filter(sheet, Country %in% !!hief_names)

  # Harmonise the two country labels that differ from `countries`
  kept <- mutate(kept, Country = gsub("United States of America", "United States", Country))
  kept <- mutate(kept, Country = gsub("German Federal Republic", "Germany", Country))

  # France is missing
  data_rep <- mutate(kept,
                     EFindexrep = round(EF_index * 100, digits = 1),
                     EPindexrep = round(EP_index * 100, digits = 1))

  data_rep
}

# Non-white population share by country and year from the original HIEF
# group-level sheet.
#
# Each group row is flagged as white when its `Group Name` is in the list for
# its country; the share is 100 * (1 - white estimate total / estimate total)
# within each Country-Year.
#
# countries: display names to keep; the dataset's own labels for the United
#            States and Germany are accepted as well and renamed.
# Returns one row per Country-Year with total, white_total and share_nonwhite.
get_hief_nonwhite <- function(countries){

  hief_names <- c(countries, "United States of America", "German Federal Republic")

  groups <- readxl::read_xlsx("source/raw/drivers/hief/replication/HIEF_dataset-master/data/ethnic fractionalization - original.xlsx") %>%
    filter(Country %in% !!hief_names) %>%
    mutate(Country = gsub("United States of America", "United States", Country)) %>%
    mutate(Country = gsub("German Federal Republic", "Germany", Country))

  # Group names counted as white, keyed by country; a country without an entry
  # here gets no white groups
  white_groups <- list("United States"  = c("white"),
                       "Canada"         = c("british", "french", "othereuropean"),
                       "United Kingdom" = c("english", "welsh", "irish", "scottish"),
                       "Switzerland"    = c("swiss", "german", "french", "italian"),
                       "Germany"        = c("turk", "german", "yugoslavian", "italian"),
                       "Sweden"         = c("swedish", "finnish"),
                       "Norway"         = c("norwegian"),
                       "Denmark"        = c("danish"),
                       "Japan"          = c(""),
                       "Australia"      = c("white"),
                       "New Zealand"    = c("white"))

  # A row is white if it matches its own country's list
  is_white <- rep(FALSE, nrow(groups))
  for (ctry in names(white_groups)){
    is_white <- is_white | (groups$Country == ctry & groups$`Group Name` %in% white_groups[[ctry]])
  }
  groups$white <- as.numeric(is_white)

  totals <- groups %>% group_by(Country, Year) %>%
              dplyr::summarize(total = sum(`Group Estimate`), white_total = sum(`Group Estimate` * white))

  dplyr::mutate(totals, share_nonwhite = 100 *(1 - white_total / total))
}

# Private 24-hour TV news outlets per country.
#
# Summarises the outlet sheet by country and private_cable (outlet count,
# summed share, largest share), keeps the private_cable == "yes" rows and adds
# their share relative to the country's total across both categories. The
# result is stamped year 2020 and stacked on a copy stamped year 1980 in which
# every numeric measure is zero.
get_24hr_news <- function(){
  outlets <- readxl::read_xlsx("source/raw/drivers/reutersnews/data.xlsx")

  by_type <- outlets %>%
    group_by(country, private_cable) %>%
    dplyr::summarize(total_outlets = n(), total_share = sum(share), max_share = max(share))

  country_totals <- by_type %>%
    ungroup() %>%
    group_by(country) %>%
    dplyr::summarize(all_share = sum(total_share))

  private <- by_type %>%
    filter(private_cable == "yes") %>%
    merge(country_totals) %>%
    mutate(rel_share = 100 * total_share / all_share)

  # All-zero baseline observation
  baseline <- private %>%
    mutate(total_outlets = 0, total_share = 0, max_share = 0, all_share = 0, rel_share = 0) %>%
    mutate(year = 1980)
  current  <- private %>% mutate(year = 2020)

  dplyr::bind_rows(current, baseline)
}

# Share of people getting news from each source.
#
# Reads the internet news sheet, stamps it year 2020 and stacks it on a copy
# stamped year 1995 in which `share` is zero.
get_social_news <- function(){
  survey <- readxl::read_xlsx("source/raw/drivers/reutersnews/internet_data.xlsx")

  # Zero-share baseline observation
  baseline <- survey %>% mutate(share = 0) %>% mutate(year = 1995)
  current  <- survey %>% mutate(year = 2020)

  dplyr::bind_rows(current, baseline)
}


# Combine two inequality sources into one series per country.
#
# ineq_data1: raw table with columns Entity, Code and (after dropping Code) a
#             year and an inequality column in positions 2 and 3.
# ineq_data2: second source; assumed to already have columns country, years,
#             inequality and source -- TODO confirm against load_worldbank_data.
# countries:  country names to process, in output order.
#
# For each country the source with more rows is used (ties go to ineq_data1).
# Returns a list of two elements:
#   [[1]] the combined series without the `source` column
#   [[2]] one row per country with country, corr (correlation between the two
#         sources over shared years, rounded to 3 digits, "-" if unavailable)
#         and source, all as character, sorted by country
get_ineq_data <- function(ineq_data1, ineq_data2, countries) {

  # Prepare the first source: requested countries, complete rows, labelled
  ineq_data1 <- select(ineq_data1, -Code)
  ineq_data1 <- filter(ineq_data1, Entity %in% countries)
  ineq_data1 <- drop_na(ineq_data1)
  ineq_data1 <- mutate(ineq_data1, source = "Roser and Ortiz-Ospina (2013)")
  names(ineq_data1)[1:3] <- c("country","years","inequality")

  # Merge the sources using whichever covers more years of a given country
  ineq_data <- data.frame(country=character(), inequality=double(), years=integer(), source=character())

  for (ctry in countries){

    rows_1 <- ineq_data1 %>% filter(country == ctry)
    rows_2 <- ineq_data2 %>% filter(country == ctry)

    if (nrow(rows_1) >= nrow(rows_2)){
      chosen <- rows_1
    } else {
      chosen <- rows_2
    }

    ineq_data <- rbind(ineq_data, chosen)
  }

  # Compute correlations between sources for overlapping years
  overlap         <- inner_join(ineq_data1, ineq_data2, by=c("country", "years"))
  corr_by_country <- overlap %>%
                     group_by(country) %>%
                     mutate(corr = cor(inequality.x, inequality.y)) %>%
                     select(country, corr) %>%
                     summarise_all(mean)

  # Attach the chosen source per country and format everything as text
  chosen_sources   <- ineq_data[,c("country","source")]
  ineq_source_corr <- corr_by_country %>%
                      right_join(chosen_sources, by="country") %>%
                      mutate_at(vars(corr), list(~ifelse(is.na(.), "-", round(., digits = 3)))) %>%
                      mutate_all(as.character) %>%
                      distinct()
  ineq_source_corr <- ineq_source_corr[order(ineq_source_corr$country),]

  list(select(ineq_data, -source), ineq_source_corr)
}



# Foreign-born share of the population by country and year, combining a direct
# percentage series with a share computed from immigrant stock and population.
#
# imm_data_pct:   direct percentages with columns LOCATION, TIME, Value.
# imm_data_stock: immigrant stock with columns LOCATION, TIME, Value.
# imm_data_fjn:   additional stock rows, row-bound to imm_data_stock; assumed
#                 to have columns country, years, stock -- TODO confirm.
# pop_data:       population with columns country, years, population.
# country_map:    lookup with columns `code` (matched to LOCATION) and `country`.
# rescale:        if TRUE, shift each country's direct percentages by the mean
#                 difference between the two series over their shared years.
# countries:      country names to keep. Defaults to country_map$country so the
#                 function no longer depends on a global `countries`.
#
# Stock-based rows are flagged imputed = TRUE, direct rows imputed = FALSE.
# Where both exist for a country-year the stock-based row is kept.
# Returns a data frame (country, years, immigration) sorted by country, years.
get_imm_data <- function(imm_data_pct, imm_data_stock, imm_data_fjn, pop_data, country_map,
                         rescale = TRUE, countries = country_map$country) {

  # Force the default now and normalise to character for the filters below
  countries             <- as.character(countries)

  imm_data_pct          <- imm_data_pct %>%
                           inner_join(country_map, by = c("LOCATION" = "code")) %>%
                           select("country","TIME","Value") %>%
                           filter(country %in% countries) %>%
                           mutate(imputed = FALSE)
  names(imm_data_pct)   <- c("country", "years", "immigration", "imputed")
  imm_data_stock        <- imm_data_stock %>% 
                           inner_join(country_map, by = c("LOCATION" = "code")) %>%
                           select("country","TIME","Value") %>%
                           filter(country %in% countries)
  names(imm_data_stock) <- c("country", "years", "stock")

  # Stock-based share = stock / population * 100, stacked on the direct series
  imm_data              <- imm_data_stock %>% 
                           rbind(imm_data_fjn) %>%
                           inner_join(pop_data, by=c("country", "years")) %>%
                           mutate(immigration = stock/population*100) %>%
                           select("country","years","immigration") %>%
                           mutate(imputed = TRUE) %>%
                           rbind(imm_data_pct)

  if (rescale == TRUE) {

    # Rescale direct percentage such that the averages across sources agree:
    # add_cons is the mean (stock-based minus direct) gap over shared years
    imm_data_diff       <- inner_join(imm_data[imm_data$imputed == TRUE,], 
                                      imm_data[imm_data$imputed == FALSE,], 
                                      by=c("country", "years")) %>%
                           mutate(add_cons = immigration.x - immigration.y) %>%
                           select(country, add_cons) %>%
                           group_by(country) %>%
                           summarise_all(mean)

  } else {

    imm_data_diff       <- country_map %>%
                           select(country) %>%
                           mutate(add_cons = 0)

  }

  # Merge the two data sources using imputed percentage whenever the sources overlap.
  # add_cons is only applied to direct rows, since (1 - imputed) is 0 for stock-based rows.
  # NOTE(review): with rescale = TRUE a country that has no year covered by
  # both series has no row in imm_data_diff and is dropped entirely by this
  # inner_join -- confirm that this is intended.
  imm_data            <- imm_data %>%
                         inner_join(imm_data_diff, by="country") %>%
                         mutate_at(vars(immigration), ~(. + add_cons * (1-imputed))) %>%
                         group_by(country, years) %>% 
                         dplyr::mutate(n = n()) %>%
                         filter(n == 1 | imputed == TRUE) %>%
                         select(country, years, immigration)
  imm_data            <- imm_data[with(imm_data, order(country, years)), ]  %>% 
                         as.data.frame()

  return(imm_data)
}


# Attach display country names to the affective-polarization series.
#
# polar_data_raw: table with columns country (file-stem name), years and
#                 partisan_affect_polarization.
# country_map:    lookup with columns `polar` (file-stem name) and `country`.
# Returns a data frame with columns country (display name), years and
# partisan_affect_polarization, the latter two coerced to numeric. Rows whose
# country has no match in country_map are dropped by the inner join.
get_polar_data <- function(polar_data_raw, country_map) {

  # Both inputs have a `country` column, so the map's copy arrives as country.y
  matched    <- inner_join(polar_data_raw, country_map, by = c("country" = "polar"))
  trimmed    <- select(matched, country.y, years, partisan_affect_polarization)
  polar_data <- mutate_at(trimmed, vars(-country.y), as.numeric)

  names(polar_data) <- c("country", "years", "partisan_affect_polarization")

  polar_data
}

### Execute
# Run the whole build as soon as this script is sourced or run; main() is
# called unconditionally at the end of the file.
main()

